directory <- "/cloud/project" # Class option when coding on RStudio Cloud
# Will need to update if working on personal computer
# Check the file path
file.path(directory, "nhanes3.rda")
## [1] "/cloud/project/nhanes3.rda"
# Load the saved R data
load(file.path(directory, "nhanes3.rda"))
# Remake a few variables from last class if they are no longer in your environment
sex1 <- factor(nhanes$sex, levels = c(1, 2), labels = c("male", "female"))
AGE5b <- cut(nhanes$age, quantile(nhanes$age, c(0, .2, .4, .6, .8, 1)), include.lowest = T) # quintiles
AGE5c <- cut(nhanes$age, breaks = c(19, 40, 50, 60, 70, 90))
age5c <- unclass(AGE5c)
nhanes <- cbind(nhanes, sex1, AGE5b, AGE5c, age5c)
# Clean up the dataset. Make sure NaN are NA for key covariates
table(nhanes$educ, useNA = "always")
##
## 1 2 3 NaN <NA>
## 2032 2349 660 33 0
nhanes$educ[is.nan(nhanes$educ)] <- NA
table(nhanes$educ, useNA = "always")
##
## 1 2 3 <NA>
## 2032 2349 660 33
table(nhanes$alc, useNA = "always")
##
## 0 1 NaN <NA>
## 2753 2189 132 0
nhanes$alc[is.nan(nhanes$alc)] <- NA
table(nhanes$alc, useNA = "always")
##
## 0 1 <NA>
## 2753 2189 132
## Does the distribution of log(sbp) look closer to the normal distribution?
par(mfrow = c(2, 1))
hist(nhanes$sbp, nclass = 20, col = "darkorchid")
hist(log(nhanes$sbp), nclass = 20, col = "seagreen3")
## Let's start with non-log transformed SBP first. Look at bivariate association between sbp and continuous covariates
par(mfrow = c(1, 1))
plot(nhanes$age, nhanes$sbp, pch = 20, cex = 0.7, col = "dodgerblue", cex.lab = 1.2, las = 1, ylab = "SBP", xlab = "Age (years)")
lines(smooth.spline(nhanes$age, nhanes$sbp, df = 10), col = "dodgerblue", lwd = 3)
plot(nhanes$bmi, nhanes$sbp, pch = 20, cex = 0.7, col = "dodgerblue", cex.lab = 1.2, las = 1, ylab = "SBP", xlab = "BMI (kg/m2)")
lines(smooth.spline(na.omit(nhanes$bmi), nhanes$sbp[na.omit(nhanes$bmi)], df = 3, ), col = "grey60", lwd = 3)
plot(nhanes$bpb, nhanes$sbp, pch = 20, cex = 0.7, col = "dodgerblue", cex.lab = 1.2, las = 1, ylab = "SBP", xlab = "Blood lead level (ug/dL)")
lines(smooth.spline(nhanes$bpb, nhanes$sbp, df = 10), col = "grey60", lwd = 3)
## Start creating a simple regression model for systolic blood pressure,
## including only blood lead (bpb) (crude model).
sbp.model <- lm(sbp ~ bpb, na.action = na.omit, data = nhanes)
sbp.model
##
## Call:
## lm(formula = sbp ~ bpb, data = nhanes, na.action = na.omit)
##
## Coefficients:
## (Intercept) bpb
## 121.666 1.162
summary(sbp.model) # Is blood lead associated with SBP? Is this the direction you would expect?
##
## Call:
## lm(formula = sbp ~ bpb, data = nhanes, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -51.586 -13.964 -3.757 10.612 114.521
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 121.66581 0.43552 279.36 <2e-16 ***
## bpb 1.16166 0.08528 13.62 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.6 on 5072 degrees of freedom
## Multiple R-squared: 0.03529, Adjusted R-squared: 0.0351
## F-statistic: 185.6 on 1 and 5072 DF, p-value: < 2.2e-16
summary.aov(sbp.model)
## Df Sum Sq Mean Sq F value Pr(>F)
## bpb 1 71318 71318 185.6 <2e-16 ***
## Residuals 5072 1949387 384
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(sbp.model)
## Analysis of Variance Table
##
## Response: sbp
## Df Sum Sq Mean Sq F value Pr(>F)
## bpb 1 71318 71318 185.56 < 2.2e-16 ***
## Residuals 5072 1949387 384
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Add age in the model
sbp.model1 <- lm(sbp ~ bpb + age, na.action = na.omit, data = nhanes)
summary(sbp.model1) # Does anything change with age in the model?
##
## Call:
## lm(formula = sbp ~ bpb + age, data = nhanes, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -62.405 -10.431 -1.427 8.470 101.240
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 95.00660 0.63062 150.655 < 2e-16 ***
## bpb 0.46641 0.07063 6.604 4.41e-11 ***
## age 0.60338 0.01181 51.077 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.93 on 5071 degrees of freedom
## Multiple R-squared: 0.363, Adjusted R-squared: 0.3628
## F-statistic: 1445 on 2 and 5071 DF, p-value: < 2.2e-16
## Add race, which should be a categorical variable. We can use factor() or as.factor() to create indicator variables for each value of race
table(nhanes$race)
##
## 1 2
## 3634 1440
sbp.model2 <- lm(sbp ~ bpb + age + factor(race), na.action = na.omit, data = nhanes)
summary(sbp.model2)
##
## Call:
## lm(formula = sbp ~ bpb + age + factor(race), data = nhanes, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -61.773 -10.420 -1.257 8.525 101.680
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 93.88241 0.66373 141.446 < 2e-16 ***
## bpb 0.42848 0.07080 6.052 1.53e-09 ***
## age 0.61400 0.01195 51.378 < 2e-16 ***
## factor(race)2 2.66690 0.50308 5.301 1.20e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.89 on 5070 degrees of freedom
## Multiple R-squared: 0.3665, Adjusted R-squared: 0.3661
## F-statistic: 977.8 on 3 and 5070 DF, p-value: < 2.2e-16
#### Change the reference level for a factor variable
# Method 1: Make a new (or alter the old) variable
table(nhanes$race)
##
## 1 2
## 3634 1440
race2 <- relevel(factor(nhanes$race), ref = 2)
table(race2)
## race2
## 2 1
## 1440 3634
sbp.model2 <- lm(sbp ~ bpb + age + factor(race2), na.action = na.omit, data = nhanes)
summary(sbp.model2)
##
## Call:
## lm(formula = sbp ~ bpb + age + factor(race2), data = nhanes,
## na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -61.773 -10.420 -1.257 8.525 101.680
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 96.54931 0.69301 139.319 < 2e-16 ***
## bpb 0.42848 0.07080 6.052 1.53e-09 ***
## age 0.61400 0.01195 51.378 < 2e-16 ***
## factor(race2)1 -2.66690 0.50308 -5.301 1.20e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.89 on 5070 degrees of freedom
## Multiple R-squared: 0.3665, Adjusted R-squared: 0.3661
## F-statistic: 977.8 on 3 and 5070 DF, p-value: < 2.2e-16
# Method 2: Change the contrasts in the lm() statement
sbp.model2 <- lm(sbp ~ bpb + age + C(factor(race), contr.treatment(2, base = 2)), na.action = na.omit, data = nhanes)
summary(sbp.model2)
##
## Call:
## lm(formula = sbp ~ bpb + age + C(factor(race), contr.treatment(2,
## base = 2)), data = nhanes, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -61.773 -10.420 -1.257 8.525 101.680
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 96.54931 0.69301 139.319
## bpb 0.42848 0.07080 6.052
## age 0.61400 0.01195 51.378
## C(factor(race), contr.treatment(2, base = 2))1 -2.66690 0.50308 -5.301
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## bpb 1.53e-09 ***
## age < 2e-16 ***
## C(factor(race), contr.treatment(2, base = 2))1 1.20e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.89 on 5070 degrees of freedom
## Multiple R-squared: 0.3665, Adjusted R-squared: 0.3661
## F-statistic: 977.8 on 3 and 5070 DF, p-value: < 2.2e-16
# R provides Type I sequential SS, not the default Type III marginal SS reported by SAS and SPSS.
# We can use the drop1() function to produce the familiar Type III results.
# It will compare each term with the full model.
anova(sbp.model2)
## Analysis of Variance Table
##
## Response: sbp
## Df Sum Sq Mean Sq F value
## bpb 1 71318 71318 282.470
## age 1 662217 662217 2622.848
## C(factor(race), contr.treatment(2, base = 2)) 1 7095 7095 28.102
## Residuals 5070 1280075 252
## Pr(>F)
## bpb < 2.2e-16 ***
## age < 2.2e-16 ***
## C(factor(race), contr.treatment(2, base = 2)) 1.199e-07 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
drop1(sbp.model2, test = "F")
## Single term deletions
##
## Model:
## sbp ~ bpb + age + C(factor(race), contr.treatment(2, base = 2))
## Df Sum of Sq RSS AIC
## <none> 1280075 28070
## bpb 1 9247 1289322 28104
## age 1 666469 1946544 30195
## C(factor(race), contr.treatment(2, base = 2)) 1 7095 1287170 28096
## F value Pr(>F)
## <none>
## bpb 36.625 1.535e-09 ***
## age 2639.688 < 2.2e-16 ***
## C(factor(race), contr.treatment(2, base = 2)) 28.102 1.199e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Add other covariates to the model: which variables are biologically important?
## Add sex, BMI, educ and smk.
sbp.model3 <- lm(sbp ~ bpb + age + factor(race) + factor(sex) + bmi + factor(educ) + factor(smk), na.action = na.omit, data = nhanes)
summary(sbp.model3) # What covariates are associated with SBP? Do they make sense biologically?
##
## Call:
## lm(formula = sbp ~ bpb + age + factor(race) + factor(sex) + bmi +
## factor(educ) + factor(smk), data = nhanes, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -59.202 -9.583 -1.317 7.926 101.116
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.160324 1.346404 61.765 < 2e-16 ***
## bpb 0.277763 0.075759 3.666 0.000249 ***
## age 0.615200 0.012258 50.189 < 2e-16 ***
## factor(race)2 2.024195 0.498277 4.062 4.93e-05 ***
## factor(sex)2 -3.912620 0.476734 -8.207 2.85e-16 ***
## bmi 0.530437 0.038131 13.911 < 2e-16 ***
## factor(educ)2 -0.003008 0.484418 -0.006 0.995046
## factor(educ)3 -2.675500 0.711182 -3.762 0.000170 ***
## factor(smk)2 -1.991441 0.562961 -3.537 0.000408 ***
## factor(smk)3 -0.270575 0.559123 -0.484 0.628459
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.45 on 5021 degrees of freedom
## (43 observations deleted due to missingness)
## Multiple R-squared: 0.3977, Adjusted R-squared: 0.3966
## F-statistic: 368.4 on 9 and 5021 DF, p-value: < 2.2e-16
# See output from models 1,2,and 3 side by side
stargazer(sbp.model1, sbp.model2, sbp.model3, type = "text", dep.var.labels = "Systolic Blood Pressure (mmHg)")
##
## ==============================================================================================================================
## Dependent variable:
## -------------------------------------------------------------------------------
## Systolic Blood Pressure (mmHg)
## (1) (2) (3)
## ------------------------------------------------------------------------------------------------------------------------------
## bpb 0.466*** 0.428*** 0.278***
## (0.071) (0.071) (0.076)
##
## age 0.603*** 0.614*** 0.615***
## (0.012) (0.012) (0.012)
##
## C(factor(race), contr.treatment(2, base = 2))1 -2.667***
## (0.503)
##
## factor(race)2 2.024***
## (0.498)
##
## factor(sex)2 -3.913***
## (0.477)
##
## bmi 0.530***
## (0.038)
##
## factor(educ)2 -0.003
## (0.484)
##
## factor(educ)3 -2.675***
## (0.711)
##
## factor(smk)2 -1.991***
## (0.563)
##
## factor(smk)3 -0.271
## (0.559)
##
## Constant 95.007*** 96.549*** 83.160***
## (0.631) (0.693) (1.346)
##
## ------------------------------------------------------------------------------------------------------------------------------
## Observations 5,074 5,074 5,031
## R2 0.363 0.367 0.398
## Adjusted R2 0.363 0.366 0.397
## Residual Std. Error 15.932 (df = 5071) 15.890 (df = 5070) 15.445 (df = 5021)
## F Statistic 1,444.936*** (df = 2; 5071) 977.807*** (df = 3; 5070) 368.374*** (df = 9; 5021)
## ==============================================================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
### Check if alcohol consumption is a confounder
sbp.model4 <- lm(sbp ~ bpb + age + factor(race) + factor(sex) + bmi + factor(educ) + factor(smk) + factor(alc), na.action = na.omit, data = nhanes)
summary(sbp.model4) # Is alcohol associated with sbp?
##
## Call:
## lm(formula = sbp ~ bpb + age + factor(race) + factor(sex) + bmi +
## factor(educ) + factor(smk) + factor(alc), data = nhanes,
## na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -58.495 -9.580 -1.311 7.928 101.235
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82.33776 1.41399 58.231 < 2e-16 ***
## bpb 0.24874 0.07705 3.228 0.001253 **
## age 0.61361 0.01276 48.106 < 2e-16 ***
## factor(race)2 2.00975 0.50657 3.967 7.37e-05 ***
## factor(sex)2 -3.87208 0.49390 -7.840 5.50e-15 ***
## bmi 0.55795 0.03860 14.454 < 2e-16 ***
## factor(educ)2 0.02761 0.49227 0.056 0.955279
## factor(educ)3 -2.71562 0.72412 -3.750 0.000179 ***
## factor(smk)2 -2.07944 0.56937 -3.652 0.000263 ***
## factor(smk)3 -0.20883 0.57141 -0.365 0.714779
## factor(alc)1 0.41798 0.49359 0.847 0.397144
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.38 on 4891 degrees of freedom
## (172 observations deleted due to missingness)
## Multiple R-squared: 0.3967, Adjusted R-squared: 0.3954
## F-statistic: 321.6 on 10 and 4891 DF, p-value: < 2.2e-16
summary(sbp.model4)$coef # Pull out relevant information from the output: Coefficient table
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82.3377552 1.41399277 58.23067609 0.000000e+00
## bpb 0.2487418 0.07704799 3.22840133 1.253061e-03
## age 0.6136075 0.01275532 48.10601736 0.000000e+00
## factor(race)2 2.0097527 0.50656503 3.96741297 7.369686e-05
## factor(sex)2 -3.8720755 0.49390486 -7.83971931 5.502517e-15
## bmi 0.5579520 0.03860066 14.45446563 2.100456e-46
## factor(educ)2 0.0276076 0.49227488 0.05608169 9.552790e-01
## factor(educ)3 -2.7156205 0.72411551 -3.75025867 1.786946e-04
## factor(smk)2 -2.0794440 0.56937030 -3.65218209 2.627418e-04
## factor(smk)3 -0.2088304 0.57140734 -0.36546680 7.147788e-01
## factor(alc)1 0.4179774 0.49359253 0.84680655 3.971444e-01
summary(sbp.model4)$coef[2, 1] # Pull out just the beta coefficient blood Pb
## [1] 0.2487418
# 10% guideline for confounders: Does the addition of the new variable change the beta coefficient of interest by >10%?
(summary(sbp.model4)$coef[2, 1] - summary(sbp.model3)$coef[2, 1]) / summary(sbp.model3)$coef[2, 1] # Calculate the percent change in the blood Pb coefficient before and after alcohol in the model
## [1] -0.1044827
# Does alcohol meet the guideline for a confounder?
## Compute difference in SBP and 95% CI per one unit increase and IQR increase in bpb
# For one unit increase in exposure
summary(sbp.model4)$coef[2, 1] # Beta coefficient
## [1] 0.2487418
summary(sbp.model4)$coef[2, 1] - 1.96 * summary(sbp.model4)$coef[2, 2] # Lower confidence interval
## [1] 0.09772778
summary(sbp.model4)$coef[2, 1] + 1.96 * summary(sbp.model4)$coef[2, 2] # Upper confidence interval
## [1] 0.3997559
regress.display(sbp.model4) # Alternative: Convenience function to show all of the CI with the beta coefficients
##
## Coeff lower095ci upper095ci Pr>|t|
## (Intercept) 82.3377552 79.56569429 85.1098161 0.000000e+00
## bpb 0.2487418 0.09769317 0.3997905 1.253061e-03
## age 0.6136075 0.58860131 0.6386136 0.000000e+00
## factor(race)2 2.0097527 1.01665770 3.0028476 7.369686e-05
## factor(sex)2 -3.8720755 -4.84035080 -2.9038001 5.502517e-15
## bmi 0.5579520 0.48227733 0.6336266 2.100456e-46
## factor(educ)2 0.0276076 -0.93747225 0.9926875 9.552790e-01
## factor(educ)3 -2.7156205 -4.13521210 -1.2960289 1.786946e-04
## factor(smk)2 -2.0794440 -3.19566554 -0.9632225 2.627418e-04
## factor(smk)3 -0.2088304 -1.32904543 0.9113846 7.147788e-01
## factor(alc)1 0.4179774 -0.54968566 1.3856404 3.971444e-01
# To improve communication of findings, report association for an IQR increase in exposure
IQR(nhanes$bpb)
## [1] 3.1
IQR(nhanes$bpb) * summary(sbp.model4)$coef[2, 1] # Per IQR increase in exposure, beta coefficient
## [1] 0.7710997
l95ci.iqr <- IQR(nhanes$bpb) * (summary(sbp.model4)$coef[2, 1] - 1.96 * summary(sbp.model4)$coef[2, 2]) # Per IQR increase in exposure, lower confidence interval
u95ci.iqr <- IQR(nhanes$bpb) * (summary(sbp.model4)$coef[2, 1] + 1.96 * summary(sbp.model4)$coef[2, 2]) # Per IQR increase in exposure, upper confidence interval
regress.display(sbp.model4)$table[2, ] * IQR(nhanes$bpb) # Multiply the convenience table by the IQR
## Coeff lower095ci upper095ci Pr>|t|
## 0.77109973 0.30284884 1.23935061 0.00388449
## Does the variable packyrs give a better estimate than smk?
sbp.model5 <- lm(sbp ~ bpb + age + factor(race) + factor(sex) + bmi + factor(educ) + packyrs + factor(alc), na.action = na.omit, data = nhanes)
summary(sbp.model5)
##
## Call:
## lm(formula = sbp ~ bpb + age + factor(race) + factor(sex) + bmi +
## factor(educ) + packyrs + factor(alc), data = nhanes, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -59.679 -9.666 -1.233 8.026 101.502
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82.04513 1.42080 57.746 < 2e-16 ***
## bpb 0.24963 0.07906 3.157 0.001602 **
## age 0.61464 0.01317 46.658 < 2e-16 ***
## factor(race)2 2.22727 0.51892 4.292 1.81e-05 ***
## factor(sex)2 -3.70826 0.50585 -7.331 2.68e-13 ***
## bmi 0.55073 0.03930 14.012 < 2e-16 ***
## factor(educ)2 0.04590 0.50476 0.091 0.927547
## factor(educ)3 -2.60357 0.73817 -3.527 0.000424 ***
## packyrs -0.02462 0.01097 -2.244 0.024860 *
## factor(alc)1 0.32565 0.49934 0.652 0.514333
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.42 on 4677 degrees of freedom
## (387 observations deleted due to missingness)
## Multiple R-squared: 0.3952, Adjusted R-squared: 0.394
## F-statistic: 339.6 on 9 and 4677 DF, p-value: < 2.2e-16
AIC(sbp.model4)
## [1] 40722.09
AIC(sbp.model5)
## [1] 38957.68
## Which model has the lower AIC? The one with smk (model 4) or the one with packyrs (model 5)?
## Model 5 also dropped 215 more people. This is an analyst judgment call (no perfect answer). I would probably stick with model 4 since the AIC are close, to keep the people in the study.
## Run two models: One with age and one adding age as a quadratic function.
## Does the quadratic term improve the fit of the model?
sbp.model6 <- lm(sbp ~ bpb + age + factor(race) + factor(sex) + bmi + factor(educ) + factor(smk) + factor(alc) + I(age^2), na.action = na.omit, data = nhanes)
summary(sbp.model6) # Is the age squared term significant?
##
## Call:
## lm(formula = sbp ~ bpb + age + factor(race) + factor(sex) + bmi +
## factor(educ) + factor(smk) + factor(alc) + I(age^2), data = nhanes,
## na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -59.724 -9.518 -1.413 7.689 101.809
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 87.0142129 1.8737032 46.440 < 2e-16 ***
## bpb 0.2582957 0.0769837 3.355 0.000799 ***
## age 0.3452699 0.0718091 4.808 1.57e-06 ***
## factor(race)2 2.0928803 0.5063451 4.133 3.64e-05 ***
## factor(sex)2 -3.7960138 0.4936354 -7.690 1.77e-14 ***
## bmi 0.5913718 0.0395399 14.956 < 2e-16 ***
## factor(educ)2 0.0989887 0.4919604 0.201 0.840541
## factor(educ)3 -2.4208295 0.7272801 -3.329 0.000879 ***
## factor(smk)2 -1.8118191 0.5729428 -3.162 0.001575 **
## factor(smk)3 0.0969062 0.5762782 0.168 0.866465
## factor(alc)1 0.4773454 0.4931648 0.968 0.333131
## I(age^2) 0.0026033 0.0006856 3.797 0.000148 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.36 on 4890 degrees of freedom
## (172 observations deleted due to missingness)
## Multiple R-squared: 0.3984, Adjusted R-squared: 0.3971
## F-statistic: 294.4 on 11 and 4890 DF, p-value: < 2.2e-16
## Use the anova function to compare the 2 models and
## see if the quadratic term improves the model.
anova(sbp.model4, sbp.model6, test = "F")
## Analysis of Variance Table
##
## Model 1: sbp ~ bpb + age + factor(race) + factor(sex) + bmi + factor(educ) +
## factor(smk) + factor(alc)
## Model 2: sbp ~ bpb + age + factor(race) + factor(sex) + bmi + factor(educ) +
## factor(smk) + factor(alc) + I(age^2)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 4891 1157606
## 2 4890 1154203 1 3403 14.418 0.0001482 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## We'll use sbp.model6 as our final model
## What is the relationship between bpb and sbp?
### Regression Diagnostics
## In the case of linear model, the plot of the model gives diagnostic plots
par(mfrow = c(1, 1))
plot(sbp.model6)
par(mfrow = c(2, 2))
plot(sbp.model6, id.n = 5, cex = 0.1)
# plot.lm(sbp.model6)
par(mfrow = c(1, 1))
plot(sbp.model6, which = 1) # Can ask for the four plots one at a time
plot(sbp.model6, which = 2)
plot(sbp.model6, which = 3)
plot(sbp.model6, which = 5)
plot(sbp.model6, which = 4) # There is an extra, hidden plot you can call out individually
## How about log(sbp)?
## Check how diagnostic plots using log-transformed sbp look like
sbp.model6.log <- lm(log(sbp) ~ bpb + age + factor(race) + factor(sex) + bmi + factor(educ) + factor(smk) + factor(alc) + I(age^2), na.action = na.omit, data = nhanes)
summary(sbp.model6.log)
##
## Call:
## lm(formula = log(sbp) ~ bpb + age + factor(race) + factor(sex) +
## bmi + factor(educ) + factor(smk) + factor(alc) + I(age^2),
## data = nhanes, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.53660 -0.07304 -0.00487 0.06676 0.57499
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.505e+00 1.401e-02 321.638 < 2e-16 ***
## bpb 1.974e-03 5.755e-04 3.430 0.000609 ***
## age 3.281e-03 5.368e-04 6.113 1.06e-09 ***
## factor(race)2 1.535e-02 3.785e-03 4.054 5.11e-05 ***
## factor(sex)2 -3.504e-02 3.690e-03 -9.496 < 2e-16 ***
## bmi 4.864e-03 2.956e-04 16.455 < 2e-16 ***
## factor(educ)2 1.928e-03 3.678e-03 0.524 0.600129
## factor(educ)3 -1.837e-02 5.437e-03 -3.379 0.000734 ***
## factor(smk)2 -1.415e-02 4.283e-03 -3.303 0.000964 ***
## factor(smk)3 5.033e-04 4.308e-03 0.117 0.906992
## factor(alc)1 5.098e-03 3.687e-03 1.383 0.166742
## I(age^2) 1.389e-05 5.125e-06 2.710 0.006758 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1148 on 4890 degrees of freedom
## (172 observations deleted due to missingness)
## Multiple R-squared: 0.4159, Adjusted R-squared: 0.4146
## F-statistic: 316.5 on 11 and 4890 DF, p-value: < 2.2e-16
plot(sbp.model6.log)
hist(residuals(sbp.model6.log), nclass = 20)
par(mfrow = c(2, 2))
plot(sbp.model6, which = 1)
plot(sbp.model6.log, which = 1)
hist(residuals(sbp.model6), main = "Histogram of res(SBP)")
hist(residuals(sbp.model6.log), main = "Histogram of res(log(SBP))")
par(mfrow = c(1, 1))
### Partial Residual Plots
## Plot sbp vs. bpb given that other variables are in the model (adjusted)
## This can be done by 'termplot'
termplot(sbp.model6, partial.resid = TRUE, col.res = "gray30", cex = 0.3, smooth = panel.smooth)
summary(sbp.model6.log)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.504957e+00 1.400631e-02 321.6375700 0.000000e+00
## bpb 1.973727e-03 5.754686e-04 3.4297737 6.090800e-04
## age 3.281259e-03 5.367875e-04 6.1127716 1.055187e-09
## factor(race)2 1.534593e-02 3.785033e-03 4.0543714 5.105082e-05
## factor(sex)2 -3.504012e-02 3.690025e-03 -9.4959008 3.321831e-21
## bmi 4.863685e-03 2.955687e-04 16.4553462 2.932179e-59
## factor(educ)2 1.927926e-03 3.677504e-03 0.5242485 6.001295e-01
## factor(educ)3 -1.836873e-02 5.436567e-03 -3.3787365 7.338916e-04
## factor(smk)2 -1.414501e-02 4.282864e-03 -3.3026976 9.644763e-04
## factor(smk)3 5.033236e-04 4.307797e-03 0.1168401 9.069915e-01
## factor(alc)1 5.098266e-03 3.686508e-03 1.3829529 1.667424e-01
## I(age^2) 1.388725e-05 5.125029e-06 2.7096926 6.758042e-03
summary(sbp.model6.log)$coef[2, 1]
## [1] 0.001973727
summary(sbp.model6.log)$coef[2, 2]
## [1] 0.0005754686
exp(summary(sbp.model6.log)$coef[2, 1])
## [1] 1.001976
# exp(sbp.model6.log$coef[2])
100 * (exp(summary(sbp.model6.log)$coef[2, 1]) - 1) # Per one unit increase in blood Pb, percent change in systolic blood pressure
## [1] 0.1975676
100 * (exp(summary(sbp.model6.log)$coef[2, 1] - 1.96 * summary(sbp.model6.log)$coef[2, 2]) - 1) # Lower confidence interval
## [1] 0.08461663
100 * (exp(summary(sbp.model6.log)$coef[2, 1] + 1.96 * summary(sbp.model6.log)$coef[2, 2]) - 1) # Upper confidence interval
## [1] 0.310646
IQR(nhanes$bpb)
## [1] 3.1
100 * (exp(IQR(nhanes$bpb) * summary(sbp.model6.log)$coef[2, 1]) - 1) # Per IQR unit increase in blood Pb, percent change in systolic blood pressure
## [1] 0.613731
100 * (exp(IQR(nhanes$bpb) * (summary(sbp.model6.log)$coef[2, 1] - 1.96 * summary(sbp.model6.log)$coef[2, 2])) - 1) # Lower confidence interval
## [1] 0.2625447
100 * (exp(IQR(nhanes$bpb) * (summary(sbp.model6.log)$coef[2, 1] + 1.96 * summary(sbp.model6.log)$coef[2, 2])) - 1) # Upper confidence interval
## [1] 0.9661474
# What if the relationship between blood lead and sbp is non-linear?
library(mgcv)
## Loading required package: nlme
## This is mgcv 1.8-33. For overview type 'help("mgcv-package")'.
##
## Attaching package: 'mgcv'
## The following object is masked from 'package:nnet':
##
## multinom
sbp.model6.gam <- gam(sbp ~ s(bpb) + age + factor(race) + factor(sex) + bmi + factor(educ) + factor(smk) + factor(alc) + I(age^2), na.action = na.omit, data = nhanes)
summary(sbp.model6.gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## sbp ~ s(bpb) + age + factor(race) + factor(sex) + bmi + factor(educ) +
## factor(smk) + factor(alc) + I(age^2)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 88.2143968 1.8617921 47.381 < 2e-16 ***
## age 0.3363437 0.0718470 4.681 2.93e-06 ***
## factor(race)2 2.0313316 0.5062947 4.012 6.11e-05 ***
## factor(sex)2 -3.5783017 0.5001717 -7.154 9.67e-13 ***
## bmi 0.5931173 0.0395125 15.011 < 2e-16 ***
## factor(educ)2 0.1625592 0.4919648 0.330 0.741090
## factor(educ)3 -2.2564780 0.7288554 -3.096 0.001973 **
## factor(smk)2 -1.8333930 0.5724650 -3.203 0.001371 **
## factor(smk)3 -0.0462355 0.5780681 -0.080 0.936254
## factor(alc)1 0.4435799 0.4929664 0.900 0.368262
## I(age^2) 0.0026407 0.0006852 3.854 0.000118 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(bpb) 2.36 3.014 6.61 0.000184 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.398 Deviance explained = 40%
## GCV = 236.21 Scale est. = 235.57 n = 4902
plot(sbp.model6.gam)
plot(sbp.model6.gam, xlab = "Blood lead (ug/dL)", ylab = "Change in log(SBP)") # Does this relationship look linear?
# Option to log transform the exposure
sbp.model7 <- lm(sbp ~ log(bpb) + age + factor(race) + factor(sex) + bmi + factor(educ) + factor(smk) + factor(alc) + I(age^2), na.action = na.omit, data = nhanes)
summary(sbp.model7)
##
## Call:
## lm(formula = sbp ~ log(bpb) + age + factor(race) + factor(sex) +
## bmi + factor(educ) + factor(smk) + factor(alc) + I(age^2),
## data = nhanes, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -59.464 -9.543 -1.339 7.746 103.034
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 86.8708956 1.8722723 46.399 < 2e-16 ***
## log(bpb) 1.4222563 0.3526023 4.034 5.58e-05 ***
## age 0.3316940 0.0719935 4.607 4.18e-06 ***
## factor(race)2 2.0361714 0.5066138 4.019 5.93e-05 ***
## factor(sex)2 -3.5467680 0.5053567 -7.018 2.55e-12 ***
## bmi 0.5902534 0.0394943 14.945 < 2e-16 ***
## factor(educ)2 0.1417779 0.4914805 0.288 0.77300
## factor(educ)3 -2.2839018 0.7293924 -3.131 0.00175 **
## factor(smk)2 -1.8271820 0.5726692 -3.191 0.00143 **
## factor(smk)3 -0.0474241 0.5795115 -0.082 0.93478
## factor(alc)1 0.4312724 0.4933101 0.874 0.38203
## I(age^2) 0.0026700 0.0006858 3.893 0.00010 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.36 on 4890 degrees of freedom
## (172 observations deleted due to missingness)
## Multiple R-squared: 0.399, Adjusted R-squared: 0.3977
## F-statistic: 295.2 on 11 and 4890 DF, p-value: < 2.2e-16
summary(sbp.model6)
##
## Call:
## lm(formula = sbp ~ bpb + age + factor(race) + factor(sex) + bmi +
## factor(educ) + factor(smk) + factor(alc) + I(age^2), data = nhanes,
## na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -59.724 -9.518 -1.413 7.689 101.809
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 87.0142129 1.8737032 46.440 < 2e-16 ***
## bpb 0.2582957 0.0769837 3.355 0.000799 ***
## age 0.3452699 0.0718091 4.808 1.57e-06 ***
## factor(race)2 2.0928803 0.5063451 4.133 3.64e-05 ***
## factor(sex)2 -3.7960138 0.4936354 -7.690 1.77e-14 ***
## bmi 0.5913718 0.0395399 14.956 < 2e-16 ***
## factor(educ)2 0.0989887 0.4919604 0.201 0.840541
## factor(educ)3 -2.4208295 0.7272801 -3.329 0.000879 ***
## factor(smk)2 -1.8118191 0.5729428 -3.162 0.001575 **
## factor(smk)3 0.0969062 0.5762782 0.168 0.866465
## factor(alc)1 0.4773454 0.4931648 0.968 0.333131
## I(age^2) 0.0026033 0.0006856 3.797 0.000148 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.36 on 4890 degrees of freedom
## (172 observations deleted due to missingness)
## Multiple R-squared: 0.3984, Adjusted R-squared: 0.3971
## F-statistic: 294.4 on 11 and 4890 DF, p-value: < 2.2e-16
## compare model6 and model7
AIC(sbp.model6, sbp.model7) # Which model has the lower AIC? The spline term or the log term for blood Pb?
## df AIC
## sbp.model6 13 40709.65
## sbp.model7 13 40704.64
# What if the relationship between blood Pb and sbp varies by sex?
table(nhanes$sex)
##
## 1 2
## 2335 2739
### Stratified by sex
## Male (sex==1)
sbp.model6.male <- lm(sbp ~ bpb + age + factor(race) + bmi + factor(educ)
+ factor(smk) + factor(alc) + I(age^2), data = nhanes, subset = (sex == 1))
summary(sbp.model6.male) # What is the beta coefficient for blood Pb in males?
##
## Call:
## lm(formula = sbp ~ bpb + age + factor(race) + bmi + factor(educ) +
## factor(smk) + factor(alc) + I(age^2), data = nhanes, subset = (sex ==
## 1))
##
## Residuals:
## Min 1Q Median 3Q Max
## -55.951 -9.164 -1.214 7.452 67.763
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 94.0698502 2.7370216 34.369 < 2e-16 ***
## bpb 0.2201841 0.0907801 2.425 0.015367 *
## age 0.0247105 0.1020797 0.242 0.808748
## factor(race)2 2.6271948 0.7251217 3.623 0.000298 ***
## bmi 0.7190556 0.0667827 10.767 < 2e-16 ***
## factor(educ)2 -0.1580127 0.6931503 -0.228 0.819696
## factor(educ)3 -1.9583250 0.9739884 -2.011 0.044485 *
## factor(smk)2 -0.2674489 0.7894817 -0.339 0.734818
## factor(smk)3 1.5625824 0.8126909 1.923 0.054641 .
## factor(alc)1 -0.1923135 0.6709470 -0.287 0.774422
## I(age^2) 0.0043354 0.0009734 4.454 8.84e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.6 on 2240 degrees of freedom
## (84 observations deleted due to missingness)
## Multiple R-squared: 0.3134, Adjusted R-squared: 0.3103
## F-statistic: 102.3 on 10 and 2240 DF, p-value: < 2.2e-16
## Female (sex==2)
sbp.model6.female <- lm(sbp ~ bpb + age + factor(race) + bmi + factor(educ)
+ factor(smk) + factor(alc) + I(age^2), data = nhanes, subset = (sex == 2))
summary(sbp.model6.female) # What is the beta coefficient for blood Pb in females? Is it similar to that in males?
##
## Call:
## lm(formula = sbp ~ bpb + age + factor(race) + bmi + factor(educ) +
## factor(smk) + factor(alc) + I(age^2), data = nhanes, subset = (sex ==
## 2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.975 -9.454 -1.348 7.659 99.869
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 77.0605375 2.4865167 30.991 < 2e-16 ***
## bpb 0.2931960 0.1343263 2.183 0.02914 *
## age 0.5576526 0.0990458 5.630 1.99e-08 ***
## factor(race)2 2.0982646 0.6949672 3.019 0.00256 **
## bmi 0.5305908 0.0492797 10.767 < 2e-16 ***
## factor(educ)2 0.3947007 0.6828269 0.578 0.56329
## factor(educ)3 -2.1735912 1.0641023 -2.043 0.04119 *
## factor(smk)2 -1.4335988 0.8458868 -1.695 0.09023 .
## factor(smk)3 -0.5512632 0.8104050 -0.680 0.49642
## factor(alc)1 0.6533917 0.7063162 0.925 0.35501
## I(age^2) 0.0016206 0.0009452 1.715 0.08655 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.66 on 2640 degrees of freedom
## (88 observations deleted due to missingness)
## Multiple R-squared: 0.4611, Adjusted R-squared: 0.4591
## F-statistic: 225.9 on 10 and 2640 DF, p-value: < 2.2e-16
### Use interaction model
sbp.model6.int <- lm(sbp ~ bpb + factor(sex) + bpb * factor(sex) + age + factor(race) + bmi + factor(educ)
+ factor(smk) + factor(alc) + I(age^2), data = nhanes)
# below is the same
sbp.model6.int <- lm(sbp ~ bpb * factor(sex) + age + factor(race) + bmi + factor(educ)
+ factor(smk) + factor(alc) + I(age^2), data = nhanes)
summary(sbp.model6.int) # Is the interaction term significant?
##
## Call:
## lm(formula = sbp ~ bpb * factor(sex) + age + factor(race) + bmi +
## factor(educ) + factor(smk) + factor(alc) + I(age^2), data = nhanes)
##
## Residuals:
## Min 1Q Median 3Q Max
## -58.894 -9.530 -1.369 7.642 102.461
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 87.8854794 1.9054676 46.123 < 2e-16 ***
## bpb 0.1317420 0.0923617 1.426 0.153825
## factor(sex)2 -5.1875500 0.7476820 -6.938 4.49e-12 ***
## age 0.3385020 0.0718234 4.713 2.51e-06 ***
## factor(race)2 2.0885416 0.5060825 4.127 3.74e-05 ***
## bmi 0.5905818 0.0395204 14.944 < 2e-16 ***
## factor(educ)2 0.1078127 0.4917151 0.219 0.826458
## factor(educ)3 -2.4434079 0.7269556 -3.361 0.000782 ***
## factor(smk)2 -1.7832085 0.5727587 -3.113 0.001860 **
## factor(smk)3 0.1024602 0.5759802 0.178 0.858818
## factor(alc)1 0.4787710 0.4929064 0.971 0.331436
## I(age^2) 0.0026405 0.0006854 3.853 0.000118 ***
## bpb:factor(sex)2 0.3796814 0.1532848 2.477 0.013284 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.36 on 4889 degrees of freedom
## (172 observations deleted due to missingness)
## Multiple R-squared: 0.3992, Adjusted R-squared: 0.3977
## F-statistic: 270.7 on 12 and 4889 DF, p-value: < 2.2e-16
Exercise 6A
## Logistic regression for hypertension
## Look at hypertension status (htn)
tab1(nhanes$htn, graph = F)
## nhanes$htn :
## Frequency Percent Cum. percent
## 0 3135 61.8 61.8
## 1 1939 38.2 100.0
## Total 5074 100.0 100.0
htn.model <- glm(htn ~ bpb + age + factor(sex) + factor(race) + bmi + factor(educ) + factor(smk) + factor(alc),
family = binomial,
na.action = na.omit, data = nhanes)
summary(htn.model)
##
## Call:
## glm(formula = htn ~ bpb + age + factor(sex) + factor(race) +
## bmi + factor(educ) + factor(smk) + factor(alc), family = binomial,
## data = nhanes, na.action = na.omit)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4998 -0.7949 -0.4272 0.8959 2.5147
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.635396 0.261535 -25.371 < 2e-16 ***
## bpb 0.023959 0.011750 2.039 0.0414 *
## age 0.061698 0.002235 27.602 < 2e-16 ***
## factor(sex)2 0.063134 0.077426 0.815 0.4148
## factor(race)2 0.494599 0.080020 6.181 6.37e-10 ***
## bmi 0.097071 0.006285 15.446 < 2e-16 ***
## factor(educ)2 0.075786 0.076618 0.989 0.3226
## factor(educ)3 -0.085220 0.114171 -0.746 0.4554
## factor(smk)2 -0.037067 0.086530 -0.428 0.6684
## factor(smk)3 0.080902 0.091358 0.886 0.3759
## factor(alc)1 0.008554 0.077201 0.111 0.9118
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6500.8 on 4901 degrees of freedom
## Residual deviance: 5080.9 on 4891 degrees of freedom
## (172 observations deleted due to missingness)
## AIC: 5102.9
##
## Number of Fisher Scoring iterations: 4
# Compute ORs
logistic.display(htn.model)
##
## OR lower95ci upper95ci Pr(>|Z|)
## bpb 1.0242485 1.0009304 1.048110 4.143898e-02
## age 1.0636409 1.0589913 1.068311 1.040118e-167
## factor(sex)2 1.0651698 0.9151960 1.239720 4.148329e-01
## factor(race)2 1.6398400 1.4018069 1.918292 6.372658e-10
## bmi 1.1019384 1.0884485 1.115596 8.036933e-54
## factor(educ)2 1.0787320 0.9283169 1.253519 3.225922e-01
## factor(educ)3 0.9183104 0.7341876 1.148609 4.554123e-01
## factor(smk)2 0.9636111 0.8132928 1.141712 6.683781e-01
## factor(smk)3 1.0842650 0.9065078 1.296879 3.758574e-01
## factor(alc)1 1.0085903 0.8669650 1.173351 9.117772e-01
# Regression diagnostics
par(mfrow = c(2, 2))
plot(htn.model)
par(mfrow = c(1, 1))
termplot(htn.model, se = T, partial.resid = T)
# ## Poisson regression for respiratory death: Montana dataset from epiDisplay
# data(Montana)
# summ(Montana)
# head(Montana, 10)
# hist(Montana$respdeath)
#
# par(mfrow = c(2, 2))
# tab1(Montana$agegr)
# tab1(Montana$period)
# tab1(Montana$start)
# tab1(Montana$arsenic)
#
# ## label categorical variables
# Montana$agegr <- factor(Montana$agegr, labels = c("40-49", "50-59", "60-69", "70-79"))
# Montana$period <- factor(Montana$period, labels = c("1938-1949", "1950-1959", "1960-1969", "1970-1977"))
# Montana$start <- factor(Montana$start, labels = c("pre-1925", "1925 & after"))
# Montana$arsenic <- factor(Montana$arsenic, labels = c("<1 year", "1-4 years", "5-14 years", "15+ years"))
#
# tab1(Montana$agegr, missing = F)
# tab1(Montana$period, missing = F)
# tab1(Montana$start, missing = F)
# tab1(Montana$arsenic, missing = F)
# par(mfrow = c(1, 1))
#
# ## Compute incidence rate by age and period
# table.pyears <- tapply(Montana$personyrs, list(Montana$period, Montana$agegr), sum)
# table.deaths <- tapply(Montana$respdeath, list(Montana$period, Montana$agegr), sum)
# table.inc10000 <- table.deaths / table.pyears * 10000
# table.inc10000
#
# ## create a time-series plot of the incidence
# plot.ts(table.inc10000, plot.type = "single", xlab = "", ylab = "#/10,000 person-years", xaxt = "n", col = c("black", "blue", "red", "green"), lty = c(2, 1, 1, 2), las = 1)
# points(rep(1:4, 4), table.inc10000, pch = 22, cex = table.pyears / sum(table.pyears) * 20)
# title(main = "Incidence by age and period")
# axis(side = 1, at = 1:4, labels = levels(Montana$period))
# legend("topleft", legend = levels(Montana$agegr)[4:1], col = c("green", "red", "blue", "black"), bg = "white", lty = c(2, 1, 1, 2))
#
# ## check arsenic
# tab1(Montana$arsenic)
# tapply(Montana$respdeath, Montana$arsenic, mean)
# tapply(Montana$personyrs, Montana$arsenic, mean)
#
# ## Poisson model
# resp.mode11 <- glm(respdeath ~ period, offset = log(personyrs), family = poisson, data = Montana)
# summary(resp.mode11)
#
# resp.mode12 <- glm(respdeath ~ agegr, offset = log(personyrs), family = poisson, data = Montana)
# summary(resp.mode12)
#
# resp.mode13 <- glm(respdeath ~ period + agegr, offset = log(personyrs), family = poisson, data = Montana)
# summary(resp.mode13)
#
# AIC(resp.mode11, resp.mode12, resp.mode13)
# ## model2 is better
#
# resp.mode14 <- glm(respdeath ~ agegr + arsenic, offset = log(personyrs), family = poisson, data = Montana)
# summary(resp.mode14)
#
# ## is there a linear trend across arsenic exposure?
# resp.mode14.lin <- glm(respdeath ~ agegr + as.numeric(arsenic), offset = log(personyrs), family = poisson, data = Montana)
# summary(resp.mode14.lin)
#
# ## compute IRR
# idr.display(resp.mode14)
Optional: Exercise 6B
# Matched case-control dataset available in the epiDisplay package (packages can contain practice data as well as functions)
data(VC1to1)
summ(VC1to1) # What is the shape/structure of this dataset?
##
## No. of observations = 52
##
## Var. name obs. mean median s.d. min. max.
## 1 matset 52 13.5 13.5 7.57 1 26
## 2 case 52 0.5 0.5 0.5 0 1
## 3 smoking 52 0.81 1 0.4 0 1
## 4 rubber 52 0.33 0 0.47 0 1
## 5 alcohol 52 0.52 1 0.5 0 1
head(VC1to1) # 1 case matched to 1 control
## matset case smoking rubber alcohol
## 1 1 1 1 0 0
## 2 1 0 1 0 0
## 3 2 1 1 0 1
## 4 2 0 1 1 0
## 5 3 1 1 1 0
## 6 3 0 1 1 0
# Reshape the data to facilitate data exploration
# function 'reshape' converts wide to long or vice versa
wide <- reshape(VC1to1, timevar = "case", v.names = c("smoking", "rubber", "alcohol"), idvar = "matset", direction = "wide")
head(wide, 3)
## matset smoking.1 rubber.1 alcohol.1 smoking.0 rubber.0 alcohol.0
## 1 1 1 0 0 1 0 0
## 3 2 1 0 1 1 1 0
## 5 3 1 1 0 1 1 0
table(wide$smoking.1, wide$smoking.0, dnn = c("smoking in case", "smoking in control"))
## smoking in control
## smoking in case 0 1
## 0 0 5
## 1 5 16
# dnn: dimnames names
# matchTab() is for the conditional OR (McNemar's OR)
matchTab(VC1to1$case, VC1to1$smoking, strata = VC1to1$matset) # Is smoking exposure associated with cancer?
##
## Exposure status: $ = 1
##
## Exposure status: VC1to1 = 1
##
## Exposure status: smoking = 1
##
## Total number of match sets in the tabulation = 26
##
## Number of controls = 1
## No. of controls exposed
## No. of cases exposed 0 1
## 0 0 5
## 1 5 16
##
## Odds ratio by Mantel-Haenszel method = 1
##
## Odds ratio by maximum likelihood estimate (MLE) method = 1
## 95%CI= 0.29 , 3.454
##
matchTab(VC1to1$case, VC1to1$rubber, strata = VC1to1$matset) # Is working in the rubber industry associated with cancer?
##
## Exposure status: $ = 1
##
## Exposure status: VC1to1 = 1
##
## Exposure status: rubber = 1
##
## Total number of match sets in the tabulation = 26
##
## Number of controls = 1
## No. of controls exposed
## No. of cases exposed 0 1
## 0 13 5
## 1 4 4
##
## Odds ratio by Mantel-Haenszel method = 0.8
##
## Odds ratio by maximum likelihood estimate (MLE) method = 0.8
## 95%CI= 0.215 , 2.979
##
matchTab(VC1to1$case, VC1to1$alcohol, strata = VC1to1$matset) # Is alcohol consumption associated iwth cancer?
##
## Exposure status: $ = 1
##
## Exposure status: VC1to1 = 1
##
## Exposure status: alcohol = 1
##
## Total number of match sets in the tabulation = 26
##
## Number of controls = 1
## No. of controls exposed
## No. of cases exposed 0 1
## 0 7 2
## 1 9 8
##
## Odds ratio by Mantel-Haenszel method = 4.5
##
## Odds ratio by maximum likelihood estimate (MLE) method = 4.5
## 95%CI= 0.972 , 20.827
##
## look at the full dataset VC1to6
data(VC1to6) # 1 case matched to up to 6 controls
summ(VC1to6)
##
## No. of observations = 119
##
## Var. name obs. mean median s.d. min. max.
## 1 matset 119 15.75 17 6.96 1 26
## 2 case 119 0.22 0 0.41 0 1
## 3 smoking 119 0.7 1 0.46 0 1
## 4 rubber 119 0.37 0 0.48 0 1
## 5 alcohol 119 0.42 0 0.5 0 1
VC1to6[, ]
## matset case smoking rubber alcohol
## 1 1 1 1 0 0
## 2 1 0 1 0 0
## 3 2 1 1 0 1
## 4 2 0 1 1 0
## 5 3 1 1 1 0
## 6 3 0 1 1 0
## 7 4 1 1 0 0
## 8 4 0 1 1 1
## 9 4 0 0 1 1
## 10 5 1 0 0 1
## 11 5 0 1 0 0
## 12 5 0 1 0 0
## 13 6 1 1 0 1
## 14 6 0 0 0 0
## 15 6 0 0 0 0
## 16 7 1 1 0 1
## 17 7 0 1 0 0
## 18 7 0 1 0 1
## 19 7 0 1 1 0
## 20 8 1 1 0 0
## 21 8 0 1 0 0
## 22 8 0 1 0 1
## 23 8 0 0 1 0
## 24 9 1 1 1 1
## 25 9 0 1 1 0
## 26 9 0 1 1 0
## 27 9 0 0 1 0
## 28 10 1 0 0 0
## 29 10 0 1 1 0
## 30 10 0 0 0 0
## 31 10 0 1 0 0
## 32 11 1 1 0 1
## 33 11 0 1 0 1
## 34 11 0 0 0 0
## 35 11 0 1 0 1
## 36 12 1 1 0 0
## 37 12 0 1 0 1
## 38 12 0 1 0 0
## 39 12 0 0 0 0
## 40 13 1 1 1 0
## 41 13 0 0 0 0
## 42 13 0 0 0 0
## 43 13 0 0 0 0
## 44 14 1 1 0 1
## 45 14 0 1 0 1
## 46 14 0 0 0 0
## 47 14 0 1 1 0
## 48 14 0 1 0 1
## 49 15 1 1 0 1
## 50 15 0 1 0 0
## 51 15 0 0 0 0
## 52 15 0 1 0 0
## 53 15 0 1 0 1
## 54 16 1 1 0 1
## 55 16 0 0 0 1
## 56 16 0 1 0 1
## 57 16 0 1 0 1
## 58 16 0 1 1 0
## 59 17 1 1 1 1
## 60 17 0 1 0 1
## 61 17 0 1 1 1
## 62 17 0 1 0 0
## 63 17 0 0 1 0
## 64 18 1 1 0 1
## 65 18 0 1 0 1
## 66 18 0 0 1 0
## 67 18 0 0 1 0
## 68 18 0 1 0 0
## 69 18 0 1 0 1
## 70 19 1 0 0 0
## 71 19 0 1 0 0
## 72 19 0 0 0 0
## 73 19 0 0 0 0
## 74 19 0 0 1 0
## 75 19 0 0 0 0
## 76 20 1 1 1 1
## 77 20 0 0 0 0
## 78 20 0 0 0 0
## 79 20 0 0 0 0
## 80 20 0 1 0 0
## 81 20 0 0 0 0
## 82 21 1 1 1 1
## 83 21 0 1 1 1
## 84 21 0 1 1 1
## 85 21 0 1 0 1
## 86 21 0 1 1 1
## 87 21 0 1 1 1
## 88 21 0 1 1 1
## 89 22 1 0 0 1
## 90 22 0 1 1 1
## 91 22 0 0 0 1
## 92 22 0 1 1 0
## 93 22 0 0 0 0
## 94 22 0 0 1 0
## 95 22 0 1 0 0
## 96 23 1 1 1 1
## 97 23 0 1 1 1
## 98 23 0 1 1 1
## 99 23 0 1 1 0
## 100 23 0 1 1 1
## 101 23 0 1 1 0
## 102 23 0 1 1 0
## 103 24 1 1 1 1
## 104 24 0 1 0 0
## 105 24 0 1 0 0
## 106 24 0 0 1 1
## 107 24 0 1 0 0
## 108 24 0 1 0 1
## 109 24 0 1 0 0
## 110 25 1 0 0 0
## 111 25 0 1 1 0
## 112 25 0 1 1 1
## 113 25 0 1 0 1
## 114 25 0 1 0 0
## 115 26 1 1 0 1
## 116 26 0 0 0 0
## 117 26 0 1 1 0
## 118 26 0 0 0 0
## 119 26 0 1 1 1
# what is the effect of smoking?
matchTab(VC1to6$case, VC1to6$smoking, strata = VC1to6$matset) # Is smoking exposure associated with cancer?
##
## Exposure status: $ = 1
##
## Exposure status: VC1to6 = 1
##
## Exposure status: smoking = 1
##
## Total number of match sets in the tabulation = 26
##
## Number of controls = 1
## No. of controls exposed
## No. of cases exposed 0 1
## 0 0 0
## 1 0 3
##
## Number of controls = 2
## No. of controls exposed
## No. of cases exposed 0 1 2
## 0 0 0 1
## 1 1 1 0
##
## Number of controls = 3
## No. of controls exposed
## No. of cases exposed 0 1 2 3
## 0 0 0 1 0
## 1 1 0 4 1
##
## Number of controls = 4
## No. of controls exposed
## No. of cases exposed 0 1 2 3 4
## 0 0 0 0 0 1
## 1 0 0 1 4 0
##
## Number of controls = 5
## No. of controls exposed
## No. of cases exposed 0 1 2 3 4 5
## 0 0 1 0 0 0 0
## 1 0 1 0 1 0 0
##
## Number of controls = 6
## No. of controls exposed
## No. of cases exposed 0 1 2 3 4 5 6
## 0 0 0 0 1 0 0 0
## 1 0 0 0 0 0 1 2
##
## Odds ratio by Mantel-Haenszel method = 1.988
##
## Odds ratio by maximum likelihood estimate (MLE) method = 2.066
## 95%CI= 0.678 , 6.301
##
matchTab(VC1to6$case, VC1to6$alcohol, strata = VC1to6$matset)
##
## Exposure status: $ = 1
##
## Exposure status: VC1to6 = 1
##
## Exposure status: alcohol = 1
##
## Total number of match sets in the tabulation = 26
##
## Number of controls = 1
## No. of controls exposed
## No. of cases exposed 0 1
## 0 2 0
## 1 1 0
##
## Number of controls = 2
## No. of controls exposed
## No. of cases exposed 0 1 2
## 0 0 0 1
## 1 2 0 0
##
## Number of controls = 3
## No. of controls exposed
## No. of cases exposed 0 1 2 3
## 0 2 2 0 0
## 1 1 1 1 0
##
## Number of controls = 4
## No. of controls exposed
## No. of cases exposed 0 1 2 3 4
## 0 0 0 1 0 0
## 1 0 2 2 1 0
##
## Number of controls = 5
## No. of controls exposed
## No. of cases exposed 0 1 2 3 4 5
## 0 1 0 0 0 0 0
## 1 1 0 1 0 0 0
##
## Number of controls = 6
## No. of controls exposed
## No. of cases exposed 0 1 2 3 4 5 6
## 0 0 0 0 0 0 0 0
## 1 0 0 2 1 0 0 1
##
## Odds ratio by Mantel-Haenszel method = 5.386
##
## Odds ratio by maximum likelihood estimate (MLE) method = 5.655
## 95%CI= 1.811 , 17.659
##
# conditional logistic reg using clogit from survival package
### load the survival package
library(survival)
clogit1 <- clogit(case ~ smoking + alcohol + strata(matset), data = VC1to1) # 1 to 1 match dataset
summary(clogit1) # What covariates are associated?
## Call:
## coxph(formula = Surv(rep(1, 52L), case) ~ smoking + alcohol +
## strata(matset), data = VC1to1, method = "exact")
##
## n= 52, number of events= 26
##
## coef exp(coef) se(coef) z Pr(>|z|)
## smoking -0.3142 0.7304 0.7079 -0.444 0.6572
## alcohol 1.5715 4.8141 0.8030 1.957 0.0503 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## smoking 0.7304 1.3691 0.1824 2.925
## alcohol 4.8141 0.2077 0.9978 23.227
##
## Concordance= 0.673 (se = 0.101 )
## Likelihood ratio test= 5.02 on 2 df, p=0.08
## Wald test = 3.83 on 2 df, p=0.1
## Score (logrank) test = 4.62 on 2 df, p=0.1
clogit2 <- clogit(case ~ smoking + alcohol + strata(matset), data = VC1to6) # 1 to 6 match dataset
summary(clogit2) # What covariates are associated?
## Call:
## coxph(formula = Surv(rep(1, 119L), case) ~ smoking + alcohol +
## strata(matset), data = VC1to6, method = "exact")
##
## n= 119, number of events= 26
##
## coef exp(coef) se(coef) z Pr(>|z|)
## smoking 0.3619 1.4361 0.6244 0.580 0.56217
## alcohol 1.6552 5.2342 0.5899 2.806 0.00502 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## smoking 1.436 0.6963 0.4224 4.883
## alcohol 5.234 0.1911 1.6471 16.633
##
## Concordance= 0.694 (se = 0.065 )
## Likelihood ratio test= 11.49 on 2 df, p=0.003
## Wald test = 9.13 on 2 df, p=0.01
## Score (logrank) test = 11.04 on 2 df, p=0.004
# compute ORs
clogistic.display(clogit1)
## Conditional logistic regression predicting case : 1 vs 0
##
##
## crude OR(95%CI) adj. OR(95%CI) P(Wald's test) P(LR-test)
## smoking: 1 vs 0 1 (0.29,3.45) 0.73 (0.18,2.92) 0.657 0.655
##
## alcohol: 1 vs 0 4.5 (0.97,20.83) 4.81 (1,23.23) 0.05 0.025
##
## No. of observations = 52
clogistic.display(clogit2)
## Conditional logistic regression predicting case : 1 vs 0
##
##
## crude OR(95%CI) adj. OR(95%CI) P(Wald's test)
## smoking: 1 vs 0 2.07 (0.68,6.3) 1.44 (0.42,4.88) 0.562
##
## alcohol: 1 vs 0 5.66 (1.81,17.66) 5.23 (1.65,16.63) 0.005
##
## P(LR-test)
## smoking: 1 vs 0 0.559
##
## alcohol: 1 vs 0 0.002
##
## No. of observations = 119
tab1(nhanes$d_total) # Variable for death due to any cause in NHANES
## nhanes$d_total :
## Frequency %(NA+) %(NA-)
## 0 3794 74.8 74.8
## 1 1275 25.1 25.2
## NaN 5 0.1 0.0
## Total 5074 100.0 100.0
summ(nhanes$pmon_mec) # Number of months of follow up
## obs. mean median s.d. min. max.
## 5069 158.7 170 49.481 0 217
### Define Surv() object to use in later functions
surv.total <- Surv(nhanes$pmon_mec, nhanes$d_total)
surv.total[1:10] # Includes information on time of follow up and whether the person died, denoted with a "+"
## [1] 203+ 196+ 215+ 132 184+ 143 202+ 206+ 187+ 115
### K-M Life table and curve
fit.total <- survfit(Surv(nhanes$pmon_mec, nhanes$d_total) ~ 1) # Model with no predictors, just outcome
#summary(fit.total)
summary(nhanes$pmon_mec/12) # How many years of follow up do you have?
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 12.42 14.17 13.22 16.08 18.08 5
fit.total # How many people died over this time period?
## Call: survfit(formula = Surv(nhanes$pmon_mec, nhanes$d_total) ~ 1)
##
## 5 observations deleted due to missingness
## n events median 0.95LCL 0.95UCL
## 5069 1275 NA NA NA
plot(fit.total)
## suppress 95% CI lines and the time marks for censored subjects.
plot(fit.total, ylim = c(0.7, 1.0), conf.int = F, mark.time = F, ylab="Probability of survival", xlab="Months")
### Survival by different levels of covariates
fit.total.sex <- survfit(Surv(nhanes$pmon_mec, nhanes$d_total) ~ nhanes$sex)
fit.total.sex # How many people died in each sex group?
## Call: survfit(formula = Surv(nhanes$pmon_mec, nhanes$d_total) ~ nhanes$sex)
##
## 5 observations deleted due to missingness
## n events median 0.95LCL 0.95UCL
## nhanes$sex=1 2332 679 NA NA NA
## nhanes$sex=2 2737 596 NA NA NA
#summary(fit.total.sex)[1:10]
plot(fit.total.sex, ylim = c(0.6, 1.0), col = c("blue", "red"), lty = c(1, 2), mark.time = F, main = "Kaplan-Meier curve", xlab = "Time (months)", ylab = "Survival probability")
legend("bottomleft", legend = c("Men", "Women"), col = c("blue", "red"), lty = c(1, 2))
### Test for differences in survival curves
survdiff(Surv(nhanes$pmon_mec, nhanes$d_total) ~ nhanes$sex) # Is there a difference in survival among males and females?
## Call:
## survdiff(formula = Surv(nhanes$pmon_mec, nhanes$d_total) ~ nhanes$sex)
##
## n=5069, 5 observations deleted due to missingness.
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## nhanes$sex=1 2332 679 573 19.7 35.8
## nhanes$sex=2 2737 596 702 16.1 35.8
##
## Chisq= 35.8 on 1 degrees of freedom, p= 2e-09
cox.bpb <- coxph(Surv(nhanes$pmon_mec, nhanes$d_total) ~ nhanes$bpb)
summary(cox.bpb) # Is blood Pb associated with survival? Easier to visualize in categories of exposure
## Call:
## coxph(formula = Surv(nhanes$pmon_mec, nhanes$d_total) ~ nhanes$bpb)
##
## n= 5069, number of events= 1275
## (5 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## nhanes$bpb 0.070662 1.073219 0.005597 12.63 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## nhanes$bpb 1.073 0.9318 1.062 1.085
##
## Concordance= 0.62 (se = 0.008 )
## Likelihood ratio test= 113.3 on 1 df, p=<2e-16
## Wald test = 159.4 on 1 df, p=<2e-16
## Score (logrank) test = 153.9 on 1 df, p=<2e-16
bpb3 <- cut2(nhanes$bpb, g = 3)
tab1(bpb3) # Tertiles of blood Pb
## bpb3 :
## Frequency Percent Cum. percent
## [0.7, 2.4) 1760 34.7 34.7
## [2.4, 4.4) 1672 33.0 67.6
## [4.4,48.1] 1642 32.4 100.0
## Total 5074 100.0 100.0
# K-M Life table and curve
fit.total.bpb3 <- survfit(Surv(nhanes$pmon_mec, nhanes$d_total) ~ bpb3)
fit.total.bpb3
## Call: survfit(formula = Surv(nhanes$pmon_mec, nhanes$d_total) ~ bpb3)
##
## 5 observations deleted due to missingness
## n events median 0.95LCL 0.95UCL
## bpb3=[0.7, 2.4) 1758 248 NA NA NA
## bpb3=[2.4, 4.4) 1670 451 NA NA NA
## bpb3=[4.4,48.1] 1641 576 NA NA NA
plot(fit.total.bpb3, col = c(1:3), lty = c(1:3), ylim = c(0.6, 1.0), main = "Survival in relation to blood lead levels", xlab = "Time (months)", ylab = "Survival probability")
legend(30, 0.7, legend = c("Q1", "Q2", "Q3"), lty = c(1:3), col = c(1:3))
# crude
cox.bpb3 <- coxph(Surv(nhanes$pmon_mec, nhanes$d_total) ~ bpb3)
summary(cox.bpb3)
## Call:
## coxph(formula = Surv(nhanes$pmon_mec, nhanes$d_total) ~ bpb3)
##
## n= 5069, number of events= 1275
## (5 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## bpb3[2.4, 4.4) 0.70388 2.02158 0.07906 8.903 <2e-16 ***
## bpb3[4.4,48.1] 1.00452 2.73061 0.07599 13.219 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## bpb3[2.4, 4.4) 2.022 0.4947 1.731 2.360
## bpb3[4.4,48.1] 2.731 0.3662 2.353 3.169
##
## Concordance= 0.605 (se = 0.007 )
## Likelihood ratio test= 196.5 on 2 df, p=<2e-16
## Wald test = 174.7 on 2 df, p=<2e-16
## Score (logrank) test = 186.5 on 2 df, p=<2e-16
# adjusted
cox.bpb3.adj <- coxph(Surv(nhanes$pmon_mec, nhanes$d_total) ~ bpb3 + nhanes$age + factor(nhanes$sex) + factor(nhanes$race) + factor(nhanes$educ) + factor(nhanes$smk) + factor(nhanes$alc))
summary(cox.bpb3.adj)
## Call:
## coxph(formula = Surv(nhanes$pmon_mec, nhanes$d_total) ~ bpb3 +
## nhanes$age + factor(nhanes$sex) + factor(nhanes$race) + factor(nhanes$educ) +
## factor(nhanes$smk) + factor(nhanes$alc))
##
## n= 4905, number of events= 1211
## (169 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## bpb3[2.4, 4.4) 0.069004 1.071440 0.081665 0.845 0.39813
## bpb3[4.4,48.1] 0.084549 1.088226 0.083037 1.018 0.30858
## nhanes$age 0.088495 1.092529 0.002373 37.287 < 2e-16 ***
## factor(nhanes$sex)2 -0.294962 0.744560 0.065259 -4.520 6.19e-06 ***
## factor(nhanes$race)2 0.178596 1.195537 0.070922 2.518 0.01180 *
## factor(nhanes$educ)2 0.001651 1.001652 0.062588 0.026 0.97895
## factor(nhanes$educ)3 -0.205472 0.814263 0.102174 -2.011 0.04432 *
## factor(nhanes$smk)2 0.171628 1.187237 0.070372 2.439 0.01473 *
## factor(nhanes$smk)3 0.631970 1.881314 0.082193 7.689 1.48e-14 ***
## factor(nhanes$alc)1 -0.182403 0.833265 0.067767 -2.692 0.00711 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## bpb3[2.4, 4.4) 1.0714 0.9333 0.9130 1.2574
## bpb3[4.4,48.1] 1.0882 0.9189 0.9248 1.2806
## nhanes$age 1.0925 0.9153 1.0875 1.0976
## factor(nhanes$sex)2 0.7446 1.3431 0.6552 0.8462
## factor(nhanes$race)2 1.1955 0.8364 1.0404 1.3738
## factor(nhanes$educ)2 1.0017 0.9984 0.8860 1.1324
## factor(nhanes$educ)3 0.8143 1.2281 0.6665 0.9948
## factor(nhanes$smk)2 1.1872 0.8423 1.0343 1.3628
## factor(nhanes$smk)3 1.8813 0.5315 1.6014 2.2102
## factor(nhanes$alc)1 0.8333 1.2001 0.7296 0.9516
##
## Concordance= 0.857 (se = 0.005 )
## Likelihood ratio test= 2351 on 10 df, p=<2e-16
## Wald test = 1629 on 10 df, p=<2e-16
## Score (logrank) test = 2328 on 10 df, p=<2e-16
### Test for the proportional hazards assumption
test.prop <- cox.zph(cox.bpb3.adj) # Do any of the variables violate the proportional hazards assumption? May consider stratifying by sex.
test.prop
## chisq df p
## bpb3 0.7663 2 0.682
## nhanes$age 1.3879 1 0.239
## factor(nhanes$sex) 4.4387 1 0.035
## factor(nhanes$race) 0.3733 1 0.541
## factor(nhanes$educ) 0.1618 2 0.922
## factor(nhanes$smk) 0.4093 2 0.815
## factor(nhanes$alc) 0.0235 1 0.878
## GLOBAL 8.0277 10 0.626
## Display a graph of the scaled Schoenfeld residuals, along with a smooth curve
plot(test.prop) # for all variables
plot(test.prop, var = 4) + # Can call up variables indivdually, here for sex
abline(h = 0, lty = 3, col = 2)
## integer(0)
# Stratify by sex
cox.bpb3.adj1 <- coxph(Surv(nhanes$pmon_mec, nhanes$d_total) ~ bpb3 + nhanes$age + strata(nhanes$sex) + factor(nhanes$race) + factor(nhanes$educ) + factor(nhanes$smk) + factor(nhanes$alc))
summary(cox.bpb3.adj1)
## Call:
## coxph(formula = Surv(nhanes$pmon_mec, nhanes$d_total) ~ bpb3 +
## nhanes$age + strata(nhanes$sex) + factor(nhanes$race) + factor(nhanes$educ) +
## factor(nhanes$smk) + factor(nhanes$alc))
##
## n= 4905, number of events= 1211
## (169 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## bpb3[2.4, 4.4) 0.069557 1.072033 0.081702 0.851 0.39458
## bpb3[4.4,48.1] 0.088475 1.092506 0.083082 1.065 0.28692
## nhanes$age 0.088456 1.092486 0.002374 37.260 < 2e-16 ***
## factor(nhanes$race)2 0.178662 1.195616 0.070942 2.518 0.01179 *
## factor(nhanes$educ)2 0.001237 1.001238 0.062599 0.020 0.98424
## factor(nhanes$educ)3 -0.209755 0.810783 0.102180 -2.053 0.04009 *
## factor(nhanes$smk)2 0.166505 1.181170 0.070394 2.365 0.01801 *
## factor(nhanes$smk)3 0.631710 1.880825 0.082187 7.686 1.51e-14 ***
## factor(nhanes$alc)1 -0.179672 0.835545 0.067736 -2.653 0.00799 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## bpb3[2.4, 4.4) 1.0720 0.9328 0.9134 1.2582
## bpb3[4.4,48.1] 1.0925 0.9153 0.9283 1.2857
## nhanes$age 1.0925 0.9153 1.0874 1.0976
## factor(nhanes$race)2 1.1956 0.8364 1.0404 1.3740
## factor(nhanes$educ)2 1.0012 0.9988 0.8856 1.1319
## factor(nhanes$educ)3 0.8108 1.2334 0.6636 0.9906
## factor(nhanes$smk)2 1.1812 0.8466 1.0289 1.3559
## factor(nhanes$smk)3 1.8808 0.5317 1.6010 2.2096
## factor(nhanes$alc)1 0.8355 1.1968 0.7317 0.9542
##
## Concordance= 0.855 (se = 0.005 )
## Likelihood ratio test= 2315 on 9 df, p=<2e-16
## Wald test = 1599 on 9 df, p=<2e-16
## Score (logrank) test = 2294 on 9 df, p=<2e-16
cox.zph(cox.bpb3.adj1)
## chisq df p
## bpb3 0.443 2 0.80
## nhanes$age 1.321 1 0.25
## factor(nhanes$race) 0.404 1 0.53
## factor(nhanes$educ) 0.156 2 0.93
## factor(nhanes$smk) 1.038 2 0.60
## factor(nhanes$alc) 0.476 1 0.49
## GLOBAL 3.675 9 0.93
## Let's make a simple macro that calculates the mean and standard deviation at the same time.
# get the results in vector form
mystats <- function(x) {
mymean <- mean(x, na.rm = T)
mysd <- sd(x, na.rm = T)
c(mean = mymean, sd = mysd)
}
mystats(nhanes$age)
## mean sd
## 48.74359 19.29721
mystats(nhanes$bpb)
## mean sd
## 3.958041 3.227670
# Assume that you are examining age-adjusted associations of SBP with from the 13th variables (bmi) to 25th variables (packyrs) (n=13)
# you want to run 13 linear regression models and save beta's and p-values
summ(nhanes)
##
## No. of observations = 5074
##
## Var. name obs. mean median s.d. min. max.
## 1 strata 5074 23.26 22 13.49 1 49
## 2 seqn 5074 27086.17 32365.5 17087.06 3 53594
## 3 race 5074 1.28 1 0.45 1 2
## 4 sex 5074 1.54 2 0.5 1 2
## 5 age 5074 48.74 46 19.3 20 90
## 6 urban 5074 1.52 2 0.5 1 2
## 7 region 5074 2.75 3 0.95 1 4
## 8 pir 4587 2.46 2.01 1.79 0 11.89
## 9 psu 5074 1.51 2 0.5 1 2
## 10 wt_mh 5074 10416.3 4495.92 13014 225.93 139744.91
## 11 sbp 5074 126.26 122 19.96 80 237
## 12 dbp 5074 74.52 74 10.57 32 134
## 13 bmi 5064 27.24 26.4 5.82 14.4 68.5
## 14 hematoc 5013 41.43 41.45 4.17 19.1 57.35
## 15 bpb 5074 3.96 3.2 3.23 0.7 48.1
## 16 chol 5016 207.21 204 45.5 83 676
## 17 trig 5006 147.21 115 122.71 22 3616
## 18 scalc 4942 9.27 9.3 0.43 7.3 13.9
## 19 creat 4942 1.09 1 0.4 0.4 13.9
## 20 calc 5074 50852.16 50049 25961.73 6 90842
## 21 sodium 5074 8121.3 3155 13206.1 0 99712
## 22 potass 5074 854.65 0.55 8176.14 0.01 97800
## 23 educ 5041 1.73 2 0.68 1 3
## 24 smk 5074 1.75 1 0.83 1 3
## 25 packyrs 4856 9.58 0 22.02 0 456
## 26 diag_ca 5074 0.07 0 0.26 0 1
## 27 diag_dm 5065 0.09 0 0.29 0 1
## 28 diag_ht 5035 0.28 0 0.45 0 1
## 29 alc 4942 0.44 0 0.5 0 1
## 30 phyact 5074 0.77 1 0.42 0 1
## 31 med_ht 5029 0.17 0 0.37 0 1
## 32 htn 5074 0.38 0 0.49 0 1
## 33 d_total 5069 0.25 0 0.43 0 1
## 34 pmon_int 5069 159.6 171 49.49 1 218
## 35 pmon_mec 5069 158.7 170 49.48 0 217
## 36 d_cancer 5069 0.05 0 0.22 0 1
## 37 d_cvd 5069 0.11 0 0.32 0 1
## 38 sex1 5074 1.54 2 0.498 1 2
## 39 AGE5b 5074 2.942 3 1.425 1 5
## 40 AGE5c 5074 2.516 2 1.55 1 5
## 41 age5c 5074 2.52 2 1.55 1 5
test.var <- nhanes[, c(13:25)]
head(test.var)
## bmi hematoc bpb chol trig scalc creat calc sodium potass educ smk
## 1 25.5 43.80 5.0 268 174 9.5 1.1 70033 460 0.3099999 2 1
## 2 29.4 46.70 2.0 225 109 9.4 1.2 70385 38591 0.5103998 3 1
## 3 44.4 49.80 6.0 162 89 9.6 1.2 40040 3050 0.5599999 1 3
## 4 37.5 47.70 15.5 212 479 10.6 1.0 10003 1321 0.8099999 1 3
## 5 23.6 40.60 11.3 202 96 9.3 1.0 40021 5295 0.6499996 1 1
## 6 31.1 37.85 7.2 186 300 8.8 1.3 80005 831 0.7099996 1 3
## packyrs
## 1 0.0
## 2 0.0
## 3 10.0
## 4 2.0
## 5 0.0
## 6 17.5
# example
mod <- lm(sbp ~ bmi + age, data = nhanes, na.action = na.omit)
summary(mod)
##
## Call:
## lm(formula = sbp ~ bmi + age, data = nhanes, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -58.828 -10.192 -1.445 8.221 98.816
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82.71612 1.17680 70.29 <2e-16 ***
## bmi 0.50219 0.03795 13.23 <2e-16 ***
## age 0.61277 0.01146 53.47 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.71 on 5061 degrees of freedom
## (10 observations deleted due to missingness)
## Multiple R-squared: 0.3793, Adjusted R-squared: 0.3791
## F-statistic: 1547 on 2 and 5061 DF, p-value: < 2.2e-16
summary(mod)$coef[2, 1] # how to extract beta for bmi
## [1] 0.5021861
summary(mod)$coef[2, 4] # how to extract p-value for bmi
## [1] 2.500308e-39
RegressResults <- function(data, y, cov) {
nvar <- ncol(data)
newdata <- data.frame(cbind(data, y, cov))
tmatrix <- data.frame(matrix(NA, 2, nvar)) # 2 rows
colnames(tmatrix) <- colnames(data) # create a row for each var
rownames(tmatrix) <- c("beta", "p")
for (i in 1:nvar) {
ind <- data[, i]
model <- lm(y ~ ind + cov, data = newdata, na.action = na.omit)
tmatrix[1, i] <- summary(model)$coef[2, 1]
tmatrix[2, i] <- summary(model)$coef[2, 4]
}
return(tmatrix)
}
reg_output <- RegressResults(test.var, nhanes$sbp, nhanes$age) # Run the function
write.csv(reg_output, file = file.path(directory, "sbp.results.csv")) # Look in your output directory for the beta coefficients and p-values for all covariates!
Optional: Exercise 6C